import os
import re
from typing import Literal

# Expose only CUDA device 0 to this process. This runs at import time, before
# any of the (deferred) swift imports below are executed.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def draw_bbox_qwen2_vl(image, response, norm_bbox: Literal["norm1000", "none"]):
    """Draw the grounding boxes found in a Qwen2-VL style response on ``image``.

    The response is scanned for spans of the form
    ``<|object_ref_start|>LABEL<|object_ref_end|><|box_start|>(x1,y1),(x2,y2)<|box_end|>``.
    Every label and its four coordinates are collected and handed to
    ``swift.llm.draw_bbox``.

    Args:
        image: Image object forwarded unchanged to ``draw_bbox``.
            NOTE(review): presumably a PIL image that is drawn on in place,
            since the return value of ``draw_bbox`` is discarded -- confirm.
        response: Model output text to search for box annotations.
        norm_bbox: Coordinate normalisation mode, forwarded to ``draw_bbox``.

    Returns:
        None. ``draw_bbox`` is still called (with empty lists) when the
        response contains no box annotations.
    """
    # Imported locally so this function also works when the module is imported
    # rather than executed as a script; at module level ``draw_bbox`` is only
    # bound inside the ``__main__`` guard.
    from swift.llm import draw_bbox

    matches = re.findall(
        r"<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>",
        response,
    )
    # Each match is a 5-tuple: (label, x1, y1, x2, y2). The coordinates are
    # passed on as the digit strings captured by the regex.
    ref = [match_[0] for match_ in matches]
    bbox = [list(match_[1:]) for match_ in matches]
    draw_bbox(image, ref, bbox, norm_bbox=norm_bbox)


def infer_grounding(output_path="bbox.png"):
    """Run a grounding inference with a LoRA adapter and save the annotated image.

    Steps, as performed below:
      1. Load the sample image from its URL.
      2. Download the ``swift/test_grounding`` adapter and restore the
         arguments stored with it.
      3. Build a ``PtEngine`` on ``args.model`` with the adapter attached and
         run a single "Task: Object Detection" request.
      4. Print the response, draw the boxes it contains on the image, and
         save the image to ``output_path``.

    Args:
        output_path: Where the annotated image is written. Defaults to
            ``"bbox.png"``, the path this script has always used.

    Returns:
        None.
    """
    # ``load_image`` is imported here together with the other swift names so
    # this function does not depend on the import done under the
    # ``__main__`` guard.
    from swift.llm import (
        BaseArguments,
        InferRequest,
        PtEngine,
        RequestConfig,
        load_image,
        safe_snapshot_download,
    )

    image = load_image(
        "http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png"
    )
    infer_request = InferRequest(
        messages=[{"role": "user", "content": "Task: Object Detection"}], images=[image]
    )

    request_config = RequestConfig(max_tokens=512, temperature=0)
    adapter_path = safe_snapshot_download("swift/test_grounding")
    # The adapter checkpoint carries the arguments it was saved with; the base
    # model id and the bbox normalisation mode are read from them below.
    args = BaseArguments.from_pretrained(adapter_path)

    engine = PtEngine(args.model, adapters=[adapter_path])
    resp_list = engine.infer([infer_request], request_config)
    response = resp_list[0].choices[0].message.content
    print(f"lora-response: {response}")

    draw_bbox_qwen2_vl(image, response, norm_bbox=args.norm_bbox)
    print(f"output_path: {output_path}")
    image.save(output_path)


if __name__ == "__main__":
    # Script entry point: bind ``draw_bbox`` and ``load_image`` as module
    # globals, then run the grounding demo. The swift import is deferred to
    # here, so it executes after CUDA_VISIBLE_DEVICES has been set above.
    from swift.llm import draw_bbox, load_image

    infer_grounding()
